# Start from an empty workspace: this script ends with save.image(), which
# writes out every object in the global environment.
rm(list = ls())

#####################
# Purpose: load the view data and clean it for processing.
#
# Part 0 - set globals and make helpers
# Part 1 - load and clean revisions data
# Part 2 - load and clean view/quality data
#
#####################

# Part 0 - set globals and make helpers
# All paths are built with paste0(), so each directory string keeps its
# trailing slash.
# basePath <- '/home/kaylea/Research/taboo/'
basePath <- "/gscratch/comdata/users/kaylea/taboo/"
# coefPath <- paste0(basePath, 'processed_data/coefs/')
coefPath <- paste0(basePath, "processed_data/euph/")
ngramPath <- paste0(basePath, "processed_data/ngram/") # ngram sample lives here
dataPath <- paste0(basePath, "processed_data/")
# source(paste0(basePath, 'libs/lib-00-utils.R'))
# load(paste0(knitrPath, 'sample_metadata.RData'))
# botsFile <- ('raw_data/botList.tsv')

# Restore the objects saved in dataset1.RData into the global environment.
load(file = paste0(dataPath, "dataset1.RData"))

library(dplyr)
library(sqldf)
library(lubridate)
library(data.table)
library(arrow)

# Read every tab-separated file in a directory into one de-duplicated data frame.
# Originally adapted from
# https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/
# but rewritten: the recipe's exists("dataset") test read the first file twice
# and could pick up an unrelated `dataset` object from an enclosing environment.
#
# Args:
#   path: directory holding the files. File names are appended with paste0(),
#         so the string must end with a path separator.
#
# Returns:
#   A data.frame with the unique rows found across all files. When the
#   directory holds no data files, a warning is raised and an empty
#   data.frame is returned.
readPileToDF <- function(path) {
  # '_SUCCESS' is a spark metadata file, not data
  file_list <- setdiff(list.files(path), '_SUCCESS')

  if (length(file_list) == 0) {
    warning("No data files found in ", path, call. = FALSE)
    return(data.frame())
  }

  # Read each file exactly once, then bind in a single step instead of
  # growing the data frame inside the loop.
  pieces <- lapply(file_list, function(my_file) {
    print(paste0('Now Reading: ', path, my_file))
    read.table(paste0(path, my_file), quote = "\"", header = TRUE, sep = "\t",
               stringsAsFactors = FALSE)
  })

  unique(do.call(rbind, pieces))
}

# Part 2 - load and clean view and quality data

## View data. The taboo and ngram samples are read separately, trimmed to the
## columns kept for analysis, stacked into vDF, and then weighted per article.

# Taboo sample: read the dataset under coefPath/vData/ with arrow.
#vDF.CTab = readPileToDF(paste0(coefPath, 'vData/'))
t <- open_dataset(paste0(coefPath, 'vData/'))
vDF.CTab <- collect(t)
vDF.CTab$source <- "taboo"

# Ngram sample: same approach, now that the data is stored as parquet files.
#vDF.NGram = readPileToDF(paste0(ngramPath,'vData/'))
t <- open_dataset(paste0(ngramPath, 'vData/'))
vDF.NGram <- collect(t)
vDF.NGram$source <- "ngram"

## clean up cols
# Drop the unwanted columns in one step; names absent from the table are
# simply ignored. `prediction` is deliberately left in until after the
# head() previews below.
vDF.NGram <- vDF.NGram[!(names(vDF.NGram) %in% c(
  "articleID.x", "articleID.y", "target.x", "target.y",
  "articleID_x", "articleID_y", "target_x", "target_y",
  "prediction.x", "prediction.y", "articlePred", "titlePred",
  "chosenTarget", "articleID", "target", "title", "ngram", "count",
  "filtered_title"
))]
# monthYear is renamed to yearMonth (new column added, old one removed).
vDF.NGram$yearMonth <- vDF.NGram$monthYear
vDF.NGram$monthYear <- NULL

vDF.CTab <- vDF.CTab[!(names(vDF.CTab) %in% c(
  "filtered_title", "titlePred", "articlePred", "articleID", "target"
))]
vDF.CTab$yearMonth <- vDF.CTab$monthYear
vDF.CTab$monthYear <- NULL

# Preview both tables before they are combined.
head(vDF.NGram)
head(vDF.CTab)

# Remove `prediction` from each sample, then stack them and de-duplicate.
vDF.NGram$prediction <- NULL
vDF.CTab$prediction <- NULL
vDF <- unique(rbind(vDF.NGram, vDF.CTab))
head(vDF)
vDF <- vDF[!is.na(vDF$viewSum), ] # drop any where viewSum is NA

# yearMonth uses '-' separators; switch to '/' and append a day-of-month so
# as.Date() can parse it.
vDF$date <- as.Date(paste0(gsub('-', '/', vDF$yearMonth), '/01'), "%Y/%m/%d")
#vDF$weighted_sum = as.numeric(vDF$weighted_sum)

# Per-article weights: (mean view readings per article) / (this article's
# number of view readings).
#numViewReadings <- vDF %>% group_by(encodedTitle) %>% dplyr::summarize(numViewReadings=count())
numViewReadings <- vDF %>%
  group_by(encodedTitle) %>%
  dplyr::summarize(numViewReadings = n()) # view readings per article
n.view.readings <- nrow(vDF) # total number of view readings
n.arts <- nrow(numViewReadings) # total number of articles
vDF <- merge(vDF, numViewReadings, by = "encodedTitle")
# numEditors is not created in this script; presumably it comes from
# dataset1.RData -- TODO confirm.
vDF <- merge(vDF, numEditors, by = "encodedTitle")
vDF$weight <- (n.view.readings / n.arts) / vDF$numViewReadings

# Split the weighted table back out by sample.
vDF.NGram <- subset(vDF, source == "ngram")
vDF.CTab <- subset(vDF, source == "taboo")




## Quality data. The taboo and ngram samples are read separately, trimmed to
## the columns kept for analysis, stacked into qDF, given article-age columns,
## and then weighted per article.

# Taboo sample.
#b <- open_dataset(paste0(coefPath, 'taboo/qData/'))
b <- open_dataset(paste0(coefPath, 'qData/'))
qDF.CTab <- collect(b)
#qDF.CTab = readPileToDF(paste0(coefPath, 'qData/'))
qDF.CTab$source <- "taboo"
qDF.CTab$titlePred <- NULL
qDF.CTab$filtered_title <- NULL

# Ngram sample.
t <- open_dataset(paste0(ngramPath, 'qData/'))
qDF.NGram <- collect(t)
#qDF.NGram = readPileToDF(paste0(ngramPath, 'qData/'))
qDF.NGram$source <- "ngram"
qDF.NGram$articlePred <- NULL

## clean up cols
qDF.NGram$articleID.y <- NULL
qDF.NGram$prediction.x <- NULL
qDF.NGram$prediction.y <- NULL

# Preview both tables before the remaining columns are removed.
head(qDF.NGram)
head(qDF.CTab)

qDF.CTab$prediction <- NULL
qDF.CTab$title <- NULL

qDF.NGram$prediction <- NULL
qDF.NGram$titlePred <- NULL
qDF.NGram$chosenTarget <- NULL
qDF.NGram$articleID <- NULL
qDF.NGram$target <- NULL
qDF.NGram$title <- NULL
qDF.NGram$filtered_title <- NULL
qDF.NGram$articleID_x <- NULL
qDF.NGram$articleID_y <- NULL
qDF.NGram$target_x <- NULL
qDF.NGram$target_y <- NULL
qDF.NGram$ngram <- NULL
qDF.NGram$count <- NULL

qDF.CTab$articleID <- qDF.CTab$target <- NULL

# Stack the two samples and de-duplicate. `prediction` was already removed
# from both inputs above, so it needs no further handling here.
qDF <- rbind(qDF.NGram, qDF.CTab)
qDF <- unique(qDF)
head(qDF)
qDF <- qDF[!is.na(qDF$page_id), ] # drop any where page_id is NA

# yearMonth uses '-' separators; switch to '/' and append a day-of-month so
# as.Date() can parse it.
qDF$date <- gsub('-', '/', qDF$yearMonth)
qDF$date <- paste0(qDF$date, '/01')
qDF$date <- as.Date(qDF$date, "%Y/%m/%d")
qDF$weighted_sum <- as.numeric(qDF$weighted_sum)

# An article's "birthday" is the earliest date it appears with in qDF.
birthDF <- qDF %>% group_by(encodedTitle) %>% dplyr::summarize(birthday = min(date))
qDF <- merge(qDF, birthDF, by = 'encodedTitle')
# Rows from each article's first observed month (taken before the age columns
# below are added, so atBirthDF does not carry them).
atBirthDF <- subset(qDF, qDF$date == qDF$birthday)
qDF$monthsOld <- (interval(qDF$birthday, qDF$date) %/% months(1)) + 1 # e.g. if equal, we are seeing "first month of life"
qDF$yearsOld <- (interval(qDF$birthday, qDF$date) %/% years(1)) + 1 # e.g. if equal, we are seeing "first year of life"

# Per-article weights: (mean quality readings per article) / (this article's
# number of quality readings), mirroring the view-data weighting.
#numQualityReadings <- qDF %>% group_by(encodedTitle) %>% dplyr::summarize(numQualityReadings=count())
numQualityReadings <- qDF %>% group_by(encodedTitle) %>% dplyr::summarize(numQualityReadings = n()) # quality readings per article
n.quality.readings <- length(qDF$weighted_sum) # total number of quality readings
# Count distinct articles (one row per encodedTitle in numQualityReadings).
# This previously used length(qDF$encodedTitle), i.e. the number of readings,
# which made n.quality.readings / n.arts always equal 1.
n.arts <- length(numQualityReadings$encodedTitle) # total number of articles
qDF <- merge(qDF, numQualityReadings, by = "encodedTitle")
qDF$weight <- (n.quality.readings / n.arts) / qDF$numQualityReadings

# Split the weighted table back out by sample.
qDF.NGram <- subset(qDF, qDF$source == 'ngram')
qDF.CTab <- subset(qDF, qDF$source == 'taboo')




#ultimateVQE <- sqldf("select encodedTitle, source, weighted_sum, date, monthsOld, yearsOld, max(date) as Value from qDF group by encodedTitle", drv = 'SQLite')
#ultimateVQE <- merge(ultimateVQE, numEdits, by='encodedTitle')
#ultimateVQE <- merge(ultimateVQE, numEditors, by='encodedTitle')

# Write the entire global workspace to dataset2.RData, using serialization
# format version 2.
save.image(file = paste0(dataPath, "dataset2.RData"), version = 2)
#save(qDF, atBirthDF, file=paste0(dataPath, "forViz2.RData"), version=2)
  
